package com.uplooking.spider.parser;

import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;
import org.springframework.stereotype.Component;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
 * Extracts link targets from the HTML of a list page.
 *
 * <p>The page is cleaned into a tag tree with {@link HtmlCleaner}, and the {@code href}
 * attribute of every anchor matched by {@link #LINK_HREF_XPATH} is collected.
 */
@Component
public class PageListParser {
    /**
     * XPath selecting the {@code href} attribute of the anchors to collect.
     *
     * <p>NOTE(review): the trailing space inside the class literal is kept exactly as it was;
     * presumably it mirrors the markup of the target site — confirm before "tidying" it.
     */
    private static final String LINK_HREF_XPATH = "//a[@class='j_th_tit ']/@href";

    private final HtmlCleaner htmlCleaner = new HtmlCleaner();

    /**
     * Collects the {@code href} values of all anchors matched by {@link #LINK_HREF_XPATH}.
     *
     * @param html raw HTML of the list page; {@code null} or empty yields an empty list
     * @return a new, mutable list of the matched {@code href} values in the order the XPath
     *         evaluation returned them; never {@code null}
     * @throws XPatherException if the XPath expression cannot be evaluated
     */
    public List<String> parse(String html) throws XPatherException {
        if (html == null || html.isEmpty()) {
            // Nothing to parse; avoid handing null to the cleaner.
            return new ArrayList<>();
        }

        TagNode root = htmlCleaner.clean(html);
        Object[] matches = root.evaluateXPath(LINK_HREF_XPATH);

        List<String> links = new ArrayList<>(matches.length);
        for (Object match : matches) {
            // The attribute step is expected to yield strings; stringify rather than cast so an
            // unexpected result type cannot fail with a ClassCastException.
            if (match != null) {
                links.add(match.toString());
            }
        }
        return links;
    }
}
